import requests
from lxml import etree
import re
import json
import time

# Fetch the titles, detail-page URLs, etc. for 100 list pages.

# Output file; one tab-separated line is appended per thread.
filePath = "tiebaAll.txt"
# List-page URL prefix; the `pn` offset is appended per page.
urlStart = "https://tieba.baidu.com/f?kw=芜湖职业技术学院&ie=utf-8&pn="
# Request headers sent with every list-page request.
# The Cookie value is written as two adjacent string literals so that it stays
# a single header line: a raw line break inside a quoted literal is a syntax
# error, and a header value must not contain a newline.
# NOTE(review): this looks like a cookie copied from a browser session - such
# values expire and are sensitive; consider loading it from the environment
# instead of hard-coding it. TODO confirm.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    'Cookie': (
        'BIDUPSID=E53ECC6A6B63A21497AD16751AF801ED; PSTM=1678155848; BAIDUID=E53ECC6A6B63A214DD1A0672D0BE832F:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=E53ECC6A6B63A214DD1A0672D0BE832F:FG=1; BA_HECTOR=2g25012000012g208121al1p1i124e91n; ZFY=u18uCCKF3eiZAvELm6:AP7NqLlmkHKkoW4FK5hFa:AobQ:C; H_PS_PSSID=36551_38270_38355_38368_37861_38173_38289_38377_37933_38316_38383_38285_26350_37881; delPer=0; BAIDU_WISE_UID=wapp_1678862099914_793; USER_JUMP=-1; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1678862104; st_key_id=17; __bid_n=186e3fabfa01c63a7c4207; arialoadData=false; FPTOKEN=2UA/wIKUQNFtEHY6XYmHubebzjSruwI9b6+SgTx+W5EzUS8Qj8PtHlDSwlT+bHMnxhL/IMLStGBSzBO7OM/tSEgzfUiCUV+jiw1sby7tBTNH3PNe42gMfoiLqxWvmsI8gKi8QJaQNvVCh960/lGtirvzxu/3zphCNhkLQQtoC88OVdqLHCUTrFJwJBe+ygYGEXBsrTuIq0JUxnH0CdinaB4MeKNQCQ2icfxTRL+w5qMyDJFb0BR79zptB30L8JTs13UoReuq85iHmJVDl0qSi4vwQFKnA9ouv/67D0Xx/eWEu9OKYFuEt6Y57sawxndF700ojWh3msMeaGJSuWOBAtJ0zmxPIKKb6bfh/Ws1+ydTeDUWeHCD4wmk3UeomWQkIBOkeLku0jl8nsTN30MUOg==|OQPcUIJo29gHr3gphQ6JhPlZ+IRFvfAjn3hzjQc/B1A=|10|d12c5091aa1dfb1945fb3ace9250c41a; wise_device=0; Hm_lvt_287705c8d9e2073d13275b18dbd746dc=1678862133; video_bubble0=1; Hm_lpvt_287705c8d9e2073d13275b18dbd746dc=1678865572; TIEBAUID=cb23caae14130a0d384a57f1; IS_NEW_USER=697672d850eaef8d8f49dc2f; BCLID=12310101485996257920; BCLID_BFESS=12310101485996257920; BDSFRCVID=Us4OJeCT5G09-d6fLqzRrgQPJHILeajTTPjcTR5qJ04BtyCVcmiREG0PtDqAucFM_EGSogKKWmOTH7uF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=Us4OJeCT5G09-d6fLqzRrgQPJHILeajTTPjcTR5qJ04BtyCVcmiREG0PtDqAucFM_EGSogKKWmOTH7uF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tbIJoDK5JDD3fP36q45HMt00qxby26nP-bn9aJ5nQI5nhKIzb5jtXb_h0l5MQlOZWg5r0Rj_QUbmjRO206oay6O3LlO83h52aC5NKl0MLPbUqqjG-UjYBUL10UnMBMnUamOnaI3a3fAKftnOM46JehL3346-35543bRTLnLy5KJYMDcnK4-XDTvQeaQP; '
        'H_BDCLCKID_SF_BFESS=tbIJoDK5JDD3fP36q45HMt00qxby26nP-bn9aJ5nQI5nhKIzb5jtXb_h0l5MQlOZWg5r0Rj_QUbmjRO206oay6O3LlO83h52aC5NKl0MLPbUqqjG-UjYBUL10UnMBMnUamOnaI3a3fAKftnOM46JehL3346-35543bRTLnLy5KJYMDcnK4-XDTvQeaQP; PSINO=7; tb_as_data=114f8b541b5964a3b3981bd82be61e0c5093696b45081877a9b9c91052f7c1b4b5b18dfc5dbda6cade4334fd315fbd8cf41acef767030eacc2b1c4c505368adf527fe2143be047e27cdef071f9ecd943a782b69cafc1a1d77c786f314ef17adc; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1678868929; XFI=693faa30-c30b-11ed-a12d-0528059d0f38; ab_sr=1.0.1_YjA1ZGRlODMzYmRkZDE3MWJkYmFmYmQyOGFiMWMzZDVhMTMxOWJmZmM1MjU4Mjc3YmJjYzdkMzBhNDM2ODRlY2M3YTg1MDQwN2Q5YmY4YmI2MTBjNTMyNGU0NGNmNjdkYTMxYmI3NzljZGYxY2E1NWI4NzkyMGNmNDYzOWQ2NmY1OGQ2MmMxZDI4YmVhNGFiMGY3NGMxMzhmZDBiOGQ2Mg==; st_data=e1b81dcd5b5938eb1cc1f3d67c9ec51bd5d5feba5c76ad20d6c9aa9e61a247d2714e0da5e427fa097658a0051b5bdecf932209cdfd296669ffd9e40f477690ebc64d6b6bef050f9e91bc6b63b21bd661eace7533c84fc5909a9d1416a8c24892; st_sign=acfa42a9; XFCS=29DBDB46E4EF21E2567DF3A5306F4A56B32BD93D26A20AD766E5436A136FBA06; XFT=tZ5/bd+ElVTTa/o4+2TdS1/HKxOUrJv9SW5lwf68tyQ=; RT="z=1&dm=baidu.com&si=f1dbcdd7-6111-4021-92a0-9f1382204468&ss=lf9f7nl3&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=3up&ul=hmv'
    ),
}

# Page 1:   https://tieba.baidu.com/f?kw=芜湖职业技术学院&ie=utf-8&pn=0
# Page 100: https://tieba.baidu.com/f?kw=芜湖职业技术学院&ie=utf-8&pn=4950
# i.e. the `pn` offset grows by 50 per page, hence range(0, 5000, 50) below.

# Seconds to wait for the server before giving up on one page; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Cuts the thread-list fragment out of the full page: everything from the
# opening <ul id="thread_list" tag up to the footer <div> that follows it.
# Compiled once because it is identical for every page.
THREAD_LIST_RE = re.compile(
    '<ul id="thread_list"' + ".*?" + '<div class="thread_list_bottom clearfix">',
    re.DOTALL,
)

# XPath (relative to one <li>) of the link carrying the thread title and URL.
TITLE_LINK_XPATH = ".//div[@class='threadlist_title pull_left j_th_tit ']/a"


def _first(nodes, default=""):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


for x in range(0, 5000, 50):
    url = urlStart + str(x)
    # Human-readable page number (1..100), matching the comments above.
    page_no = x // 50 + 1

    # Pause between consecutive requests.
    if x:
        time.sleep(2)

    try:
        # Bug fix: request the paginated `url`; the original passed `urlStart`,
        # so every iteration downloaded the same URL with an empty `pn=`.
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One failed page should not abort the remaining pages.
        print("第" + str(page_no) + "页请求失败，跳过: " + str(exc))
        continue

    # requests reports ISO-8859-1 when the server declares no charset for a
    # text response; fall back to the encoding detected from the body.
    if response.encoding == "ISO-8859-1":
        response.encoding = response.apparent_encoding

    html = response.text

    match = THREAD_LIST_RE.search(html)
    if match is None:
        # The original indexed the empty findall() result here and crashed.
        print("第" + str(page_no) + "页未找到帖子列表，跳过")
        continue

    rootElement = etree.HTML(match.group(0))
    li_list = rootElement.xpath('//ul[@id="thread_list"]/li')

    tieziList = []
    for li in li_list:
        # Title and detail-page URL identify a thread; list items without them
        # are skipped (the original raised IndexError on such items).
        tieziTitle = _first(li.xpath(TITLE_LINK_XPATH + "/text()"), None)
        detailHref = _first(li.xpath(TITLE_LINK_XPATH + "/@href"), None)
        if tieziTitle is None or detailHref is None:
            continue

        tieziList.append({
            # Reply count
            'applyNum': _first(li.xpath(".//span[@class='threadlist_rep_num center_text']/text()")),
            # Title
            'tieziTitle': tieziTitle,
            # Detail-page URL
            'detailHref': detailHref,
            # Thread author
            'fatie': _first(li.xpath(".//span[@class='frs-author-name-wrap']/a/text()")),
            # Date/time shown for the last reply; empty when absent
            'lastDatetime': _first(
                li.xpath(".//span[@class='threadlist_reply_date pull_right j_reply_data']/text()")
            ).strip(),
        })

    print("处理第" + str(page_no) + "页")

    # Append this page's rows as tab-separated lines.
    with open(filePath, "a", encoding="utf-8") as f:
        f.writelines(
            "\t".join((tiezi["applyNum"], tiezi["tieziTitle"], tiezi["detailHref"],
                       tiezi["fatie"], tiezi["lastDatetime"])) + "\n"
            for tiezi in tieziList
        )

